* NB1: comments of numbers in steps 1 & 2 of do file are not 100% exact anymore -- they are from May data using tbl_CourtAppln_plus_code_May2019.do, July data are slightly different but not much (I checked)
* NB2: this code does more than generating the variables necessary for ultimate analysis -- it also performs various checks of the data and produces other variables and intermediary datasets (if desired - now commented out)
* note on AEJ-RR revision: this version of the file keeps cases with outcomes other than grant/deny in the sample -- cf. line 196 below

log using data/EOIR_asylum/asylum_revised, replace // "revised" refers to AEJ-RR, see two lines up

**********************************************************
******** Step 1: Cleaning and investigating tbl_Court_Appln

*** importing data
import delimited data/EOIR_asylum/rawdata/tbl_Court_Appln.csv, clear varnames(1)
drop v7
drop if mi(idncase, idnproceeding, idnproceedingappln)
destring idn*, force replace

*** EOIR's May data dump had mixed up the variables names (confirmed by email from EOIR), necessitating the following: rename (idncase idnproceeding appl_code appl_dec appl_recd_date) (idnproceeding idncase appl_recd_date appl_code appl_dec)

gen recd_date = date(word(appl_recd_date,1),"YMD")
format recd_date %td

replace appl_code = strtrim(appl_code)
encode appl_code, gen(code) // ***list of codes in tblLookUp_Appln.csv***

replace appl_dec = strtrim(appl_dec)
tab appl_dec if !regexm(appl_dec,"[A-Z]") // 4 weird observations and ~20 runover dates (ingestion error)
encode appl_dec if regexm(appl_dec,"[A-Z]"), gen(dec) // *** cf. list of permitted decision codes in tblLookUp_Appln.csv ***

drop appl_*
rename (recd_date code dec) appl_=
drop if mi(appl_dec, appl_code)
compress

*** investigating duplicates, pt 1
isid idnproceedingappln // yes, it's unique
drop idnproceedingappln

duplicates report // 2+k duplicated obs (= 4+k total)
duplicates drop

* save tbl_Court_Appln_cleaned.dta, replace

*** investigating duplicates, pt 2

duplicates report idncase idnproceeding // only 1.25m idncase/proceeding entries unique, another 1m has two entries, rest more (max is 10)
duplicates report idncase idnproceeding appl_code // now mostly unique, but 10.5k have two entries, 200 have three, handful of 4 or 5
duplicates tag idncase idnproceeding appl_code, gen(dup)
tab appl_dec if dup>0 // wide variety
drop dup

*** investigating ASYW
preserve
keep if inlist(appl_code,"ASYL":code,"ASYW":code) // by tblLookUp_Appln.csv and tabulation of appl_code, these are the relevant ones -- only Q is what exactly ASYW means
duplicates drop idn* appl_code, force
reshape wide appl_dec appl_recd, i(idn*) j(appl_code) // "ASYL" will be 66, "ASYW" 67
rename *66 *_ASYL
rename *67 *_ASYW
* ASYW seems subsidiary: filed alongside ASYL in 70% of cases generally one same day, whereas only 3% are stand-alone ASYW appl
count // 1,689k
misstable sum appl_dec* // ASYL almost complete (53k missing), ASYW 445k missing
misstable pattern appl_dec* // only 3% of data have ASYW but not ASYL
count if !mi(appl_dec_ASYW, appl_dec_ASYL) // 1,191k
count if !mi(appl_dec_ASYW, appl_dec_ASYL) & appl_recd_date_ASYL== appl_recd_date_ASYW // 1,155k
* ASYW decisions mostly aligned with ASYL -- identical in 921k cases (~80%), 
count if !mi(appl_dec_ASYW, appl_dec_ASYL) & appl_recd_date_ASYL== appl_recd_date_ASYW & appl_dec_ASYL== appl_dec_ASYW // 921k
tab appl_dec* // mostly on the diagonal
tab appl_dec* if appl_recd_date_ASYL== appl_recd_date_ASYW // only sizeable off-diagonal entry that's weird is 19k ASYL=G(rant), ASYW=D(eported)
* but asylum grants much more frequent when ASYL alone!
tab appl_dec_ASYL // 274k G vs. 483k D
tab appl_dec_ASYL if !mi(appl_dec_ASYW, appl_dec_ASYL) & appl_recd_date_ASYL== appl_recd_date_ASYW & appl_dec_ASYL== appl_dec_ASYW // 36k G vs. 386k D !

*** bottom line: focus on asylum!
restore
keep if appl_code=="ASYL":code
drop appl_code
duplicates drop
duplicates report idn* appl_recd // 115 duplicated obs
duplicates report idn* // ~4k duplicated obs
* save tbl_Court_Appln_cleaned_asyl_only.dta, replace


**************************************************************
********* Step 2: Merging in additional information

*** preliminary: prepping main table for merge
gsort idn* -appl_recd_date
duplicates drop idn*, force // drop earlier case -- only about 4k
duplicates report idncase // 1.2m unique, rest duplicates up to ~10
duplicates report idnproc // none! --> multiple proceedings for same case (= person, see YB 2004 Glossary)?
tempfile bin
save `bin'

tempfile dec
label save dec using `dec', replace


*** A table
import delimited data/EOIR_asylum/rawdata/A_TblCase.csv, varnames(1) colrange(:23) clear
destring idncase, force replace
drop if mi(idncase)
isid idncase
keep idncase nat c_asy_type latest*
ds idncase, not
foreach var of varlist `r(varlist)' {
	cap replace `var' = strtrim(`var') // "cap" because some var may not be string
	}
encode nat, gen(nationality) // see clean-up below
tempfile nationality
label save nationality using `nationality', replace
replace c_asy_type = "" if !inlist(c_asy_type,"E","I") // E and I are the only values according to eoir_case_data_code_key and Ramji-Nogales et al. footnote 190
label define defensive 0 "I" 1 "E"
encode c_asy_type, gen(defensive) label(defensive)
replace latest_cal = "" if !inlist(latest_cal,"I","M")
encode latest_cal, gen(latest_cal_type_A)
count if word(latest_hearing,2)!="00:00:00.000" & !mi(latest_hearing) // 6
gen latest_hearing_A = date(word(latest_hearing,1),"YMD")
format latest_hearing_A %td
rename latest_time latest_time_A // can properly record as day time later if necessary
keep idncase nationality defensive *_A
compress
merge 1:m idncase using `bin', keep(2 3) nogenerate
save `bin', replace

*** B table
import delimited data/EOIR_asylum/rawdata/B_TblProceeding.csv, varnames(1) colrange(:31)  clear
keep idnproceeding idncase base_city_code hearing_loc_code ij_code dec_type dec_code comp_date case_type nat

destring idn*, force replace
drop if mi(idnproceeding, idncase)

ds idn*, not
foreach var of varlist `r(varlist)' {
	cap replace `var' = strtrim(`var') // "cap" because some var may not be string
	}
	
count if word(comp_date,2)!="00:00:00.000" & !mi(comp_date)
assert r(N)/_N<.0001 // 138 cases of comp_date having an hour
gen comp_date_B = date(word(comp_date,1),"YMD")
format comp_date_B %td

qui do `dec'
encode dec_code if regexm(dec_code,"[A-Z]"), gen(dec_code_B) label(dec)

qui do `nationality'
encode nat, gen(nationality) label(nationality)

encode ij, gen(judge_B)
encode base_city, gen(base_city_B) label(location)
encode hearing_loc, gen(hearing_loc_B) label(location)

label define oral 0 "W" 1 "O"
encode dec_type if !inlist(dec_type,"O","W"), gen(oral_B) label(oral) // O and W should be the only codes according to eoir_case_data_code_key

keep idn* *_B nationality
compress
merge 1:1 idn* using `bin', update replace nogenerate keep(2 3 4 5)

* clean up some nationality data (cf. tblLookupAlienNat.csv.: some codes are duplicates) 
replace nationality = "CC":nationality if nationality== "CB":nationality & nationality!=. // Kampuchea-->Cambodia
replace nationality = "PS":nationality if nationality== "PI":nationality & nationality!=. // Palau-->Republic of Palau
replace nationality = "SC":nationality if nationality== "SK":nationality & nationality!=. // St. Kiitts --> St. Kitts and Nevis
replace nationality = "DC":nationality if nationality== "CG":nationality & nationality!=. // Zaire --> Dem. Rep. congo
replace nationality = "BZ":nationality if nationality== "BS":nationality & nationality!=. // Byelarus --> Belarus
replace nationality = "NL":nationality if nationality== "HL":nationality & nationality!=. // Holland --> Netherlands
replace nationality = "SS":nationality if inlist(nationality,"NO":nationality,"??":nationality) // "no nationality", "unknown nationality"-->"stateless--alien unable to name a country"
* NB: there are some other arguable duplicates but they aren't safe to unify: e.g., East Germany and Germany (fleeing from communism vs. ...)

compress
order idncase idnproc base hearing_loc judge nat defen appl_recd appl_dec comp_date_B dec_code_B oral_B latest*
* save tbl_CourtAppln_plus, replace


************************************************************************************
****** Step 3: Probing the data, and limiting to grant/deny decisions

*** general checking
isid idnproceeding // idncase should probably be used for clustering if not clustering at higher level
* number of observations with (unique) location and date information
count if base==hearing_loc // 1.46m out of 1.65m
count if !mi(comp_date, latest_h) // 1.56m
count if comp_date == latest_hearing & !mi(comp_date, latest_h) // 704k
* is the completion date consistent with filing date? (yes)
count if appl_recd_date<comp_date & !mi(appl_recd_date, comp_date) // 1.56m -- this is how it should be
count if appl_recd_date>comp_date & !mi(appl_recd_date, comp_date) // 1,083 -- this can't be right in principle
* can we use oral format of hearing? (no)
tab oral // only 24k nonmissing
* which decision code to use
tab  appl_dec dec_code_B 
count if appl_dec==dec_code_B // unrelated -- only 56k overlap
count if inlist(appl_dec,"G":dec,"D":dec) // how many final decisions: grant vs deny (764k)


*** limit to grant/deny 1990-
keep if year(comp_date)>=1990
gen byte grant = 1 if appl_dec=="G":dec
replace grant = 0 if appl_dec=="D":dec
* keep if !mi(grant) // this is the AEJ-RR revision

* do decision numbers match EOIR Annual Reports?
gen fyear = year(comp_date) + inrange(month(comp_date),10,12) // cf. EOIR AR FY 2004 p. 10: FY runs Oct of preceding calendar year through September of calendar year
tab fyear grant // e.g., exact within 1% for grants and 0.1% for denials for FY 2000-2004 (2004 YB p. K2)
forvalues fy=1997/2019 { // 1997 is the first FY with data to compare in EOIR YB (2001, p. N1)
	di "grants by nationality -- fiscal year `fy':"
	tab nat if fyear==`fy' & grant==1 // they do match almost exactly: see, e.g., 2017 YB p. 29, 2004 YB p. J1
	}
drop fyear

* problem judges -- how many decisisons? (negligible: 1,079 decisions, all from transfers or generic names, not tests)
preserve
	* importing hand-compiled list of problematic judges
	tempfile judgecodes problemjudges
	label save judge_B using `judgecodes'
	use data/EOIR_asylum/problematic_judge_codes.dta, clear // I hand-compiled this list
	qui do `judgecodes'
	encode judge_code, gen(judge_B) label(judge_B)
	drop judge_code
	compress
	save `problemjudges'
restore, preserve
merge m:1 judge_B using `problemjudges', keep(1 3) nogenerate // 1,079 decisions from problem judges: generic codes (like "visiting judge"), named visiting judges ("Sam Smith (visiting judge)"), or clerical codes (test, transfer)
tab named_vis // 6 named visiting judges (now 8, after leaving in decisions other than grant/deny)
tab test_tr // 100 tests and transfers ... (now 4,328)
tab judge_name if test_transfer==1 // ... all of which are transfers
restore

* completion at latest hearing date? (NB: comp_date should be correct for FY assignment, cf. match of YB data to tab output above)
count if latest_hearing>comp_date & !mi(latest_hearing,comp_date)  // 77k -- this is weird but might be because of post-grant decisions such as relatives? Application for re-hearing? ...? (now 335k after leaving in decisions other than grand/deny)
count if latest_hearing<comp_date & !mi(latest_hearing,comp_date)  // 283k (now 502k)
count if latest_hearing==comp_date & !mi(latest_hearing,comp_date) // 354k (now 665k)


compress
save data/EOIR_asylum/asylum_revised, replace // "revised" means AEJ-RR version, i.e., including completions other than grant/deny
log close